# code to reproduce tables/figures
# for pivot scaling output/interpretation

# note: some of the data necessary to run this code contains sensitive information,
# and is therefore omitted
# sections that will not run are noted in comments
library(here)

here::here()

source("code/startup.R")
library(parrot)

get_keywords_custom <- function (scores, n_dimensions, n_words = 15, stretch = 3, capture_output = FALSE, 
                                 pivots_only = TRUE, topic) 
{
  all_keywords <- list()
  if (stretch%%2 != 1) 
    stop("Please enter odd integer for \"stretch\"")
  for (i in if (length(n_dimensions) == 1) {
    1:n_dimensions
  }
  else {
    n_dimensions
  }) {
    general_keywords <- scores$vocab[order(scores$pivot_scores[, 
                                                               i + 1] * sqrt(rowSums(scores$pivot_scores[, -1]^2)), 
                                           decreasing = TRUE)]
    specific_keywords <- scores$vocab[order(scores$word_scores[, 
                                                               i + 1]^(stretch) * sqrt(rowSums(scores$pivot_scores[, 
                                                                                                                   -1]^2)), decreasing = TRUE)]
    if (pivots_only) {
      keywords <- data.frame(head(rev(general_keywords), 
                                  n = n_words), head(general_keywords, n = n_words))
      names(keywords) <- c("pivots (-)", "(+) pivots")
    }
    else {
      keywords <- data.frame(head(rev(specific_keywords), 
                                  n = n_words), head(rev(general_keywords), n = n_words), 
                             head(general_keywords, n = n_words), head(specific_keywords, 
                                                                       n = n_words))
      names(keywords) <- c("scores (-)", "pivots (-)", 
                           "(+) pivots", "(+) scores")
    }
    if (capture_output) {
      all_keywords[[paste0("D", i)]] <- keywords
    }
    else {
      if (!requireNamespace("knitr", quietly = TRUE)) {
        cat("\nDimension", i, "keywords\n\n")
        print(keywords, row.names = F)
        cat("\n")
      }
      else {
        print(knitr::kable(keywords, align = "c", format = "pandoc", 
                           caption = paste("Dimension", i, "keywords: ", topic)))
        cat("\n")
      }
    }
  }
  if (capture_output) {
    return(all_keywords)
  }
}

# function to plot pivot scaling results
plot_keywords_custom <- function (scores, 
                                  scoredat = NULL,
                                  x_dimension = 1, y_dimension = 2, 
                                  q_cutoff = 0.9, 
                                  plot_density = FALSE, 
                                  plot_density2d = FALSE,
                                  plot_alpha = .7,
                                  density_by = "running",
                                  unstretch = FALSE, color = FALSE,
                                  subjname = "subject"){
  if(plot_density == TRUE & plot_density2d == TRUE){
    stop("plot_density and plot_density2d cannot both be true")
  }
  if(plot_density2d == TRUE & is.null(scoredat)){
    stop("must supply data for plotting 2d density")
  }

  if (unstretch) {
    scores$word_scores <- sweep(scores$word_scores, 1, 
                                sqrt(rowSums((scores$importance[-1] * scores$pivot_scores[, -1])^2)) + 1, `/`)
  }
  word_scores <- data.frame(scores$word_scores)
  word_counts <- scores$word_counts
  above_cutoff <- word_counts > quantile(word_counts, q_cutoff)
  x_dimension <- x_dimension + 1
  y_dimension <- y_dimension + 1
  if (color & !("color" %in% names(scores))) {
    scores$color <- factor(kmeans(scores$word_scores[, 2:11], 
                                  centers = 5)$cluster)
  }
  if (!color & !plot_density2d) {
    g <- ggplot2::ggplot() + 
      ggplot2::geom_text(data = word_scores[above_cutoff,], 
                         ggplot2::aes(x = word_scores[above_cutoff, x_dimension], 
                                      y = word_scores[above_cutoff, y_dimension], 
                                      label = scores$vocab[above_cutoff])) + 
      ggplot2::xlab(paste("Dimension:", x_dimension - 1)) + 
      ggplot2::ylab(paste("Dimension:", y_dimension - 1)) + 
      ggplot2::guides(size = "none") + ggplot2::theme_classic() + 
      ggplot2::xlim(-max(abs(word_scores[above_cutoff,  x_dimension])), 
                    max(abs(word_scores[above_cutoff, x_dimension]))) + 
      ggplot2::ylim(-max(abs(word_scores[above_cutoff, y_dimension])), 
                    max(abs(word_scores[above_cutoff, y_dimension])))+
      ggplot2::ggtitle(paste0("Embedded dimensions in Run for Something respondents' statements of interest"),
                       subtitle = paste0("Top ", 100*(1-q_cutoff), "% of words shown"))
  }
  if(!color & plot_density2d){
    
    xname <- paste0("X", x_dimension - 1)
    yname <- paste0("X", y_dimension - 1)
    
    g <- ggplot2::ggplot() + 
      ggplot2::geom_density_2d(data = scoredat,
                               ggplot2::aes(x = scoredat %>% pull(xname),
                                            y = scoredat %>% pull(yname),
                                            lty = scoredat %>% pull(density_by) %>% factor),
                               alpha = plot_alpha)+
      ggplot2::geom_text(data = word_scores[above_cutoff,], 
                         ggplot2::aes(x = word_scores[above_cutoff, x_dimension], 
                                      y = word_scores[above_cutoff, y_dimension], 
                                      label = scores$vocab[above_cutoff]),
                         alpha = plot_alpha) + 
      ggplot2::xlab(paste("Dimension:", x_dimension - 1)) + 
      ggplot2::ylab(paste("Dimension:", y_dimension - 1)) + 
      ggplot2::guides(size = "none") + ggplot2::theme_classic() + 
      ggplot2::xlim(-max(abs(word_scores[above_cutoff,  x_dimension])), 
                    max(abs(word_scores[above_cutoff, x_dimension]))) + 
      ggplot2::ylim(-max(abs(word_scores[above_cutoff, y_dimension])), 
                    max(abs(word_scores[above_cutoff, y_dimension])))+
      ggplot2::ggtitle(paste0("Embedded dimensions in Run for Something respondents' statements of interest"),
                       subtitle = paste0("Top ", 100*(1-q_cutoff), "% of words plotted by word scores in two dimensions\nOverlaid on document score densities for candidates and non-candidates in same dimensions"))
  }
  else {
    g <- ggplot2::ggplot() + 
      ggplot2::geom_text(data = word_scores[above_cutoff, ], 
                         ggplot2::aes(x = word_scores[above_cutoff, x_dimension], 
                                      y = word_scores[above_cutoff, y_dimension], 
                                      label = scores$vocab[above_cutoff], 
                                      color = scores$color[above_cutoff])) + 
      ggplot2::xlab(paste("Dimension:", x_dimension - 1)) + 
      ggplot2::ylab(paste("Dimension:", y_dimension - 1)) + 
      ggplot2::guides(size = "none", color = "none") + ggplot2::theme_classic() + 
      ggplot2::xlim(-max(abs(word_scores[above_cutoff, 
                                         x_dimension])), 
                    max(abs(word_scores[above_cutoff, x_dimension]))) + 
      ggplot2::ylim(-max(abs(word_scores[above_cutoff, y_dimension])), 
                    max(abs(word_scores[above_cutoff,  y_dimension])))+
      ggplot2::ggtitle(paste0("Embedded dimensions in Run for Something respondents' statements of interest"),
                       subtitle = paste0("Top ", 100*(1-q_cutoff), "% of words shown"))
  }
  if (!plot_density) {
    return(g)
  }else{
    gridExtra::grid.arrange(g, ggplot2::ggplot() + 
                              ggplot2::geom_density(ggplot2::aes(x = word_scores[, x_dimension])) +
                              ggplot2::xlab(paste("Dimension:", x_dimension - 1)) + 
                              ggplot2::theme_classic(), 
                            ggplot2::ggplot() + 
                              ggplot2::geom_density(ggplot2::aes(x = word_scores[,  y_dimension])) + 
                              ggplot2::xlab(paste("Dimension",  y_dimension - 1)) + 
                              ggplot2::theme_classic(), 
                            layout_matrix = rbind(c(1, 1, 2), c(1, 1, 3)))
  }
}

plot_dist <- function(data = rfs_parrot_scores, 
                      dim = 1, 
                      dimname = "General Interest (-) vs. Specific Issues (+)",
                      byvar = NULL,
                      byname = NULL){
  data$out <- data %>% pull(paste0("X", dim))
  data <- data %>% filter(!is.na(out))
  
  if(!is.null(byvar)){
    data$byvar <- data %>% pull(byvar)
    
    plot <-  
      data %>%
      ggplot(aes(x = out, lty = factor(byvar)))+
      geom_density()+
      labs(x = "Document Score", y = "Density",
           title = paste0("Distribution of Document Scores by ", byname),
           subtitle = paste0("For Dimension ", dim, ": ", dimname))
  }else{
    plot <-  
      data %>%
      ggplot(aes(x = out))+
      geom_density()+
      labs(x = 'Document Score', y = "Density",
           title = paste0("Distribution of Document Scores"),
           subtitle = paste0("For Dimension ", dim, ": ", dimname))
  }
  return(plot)
}

rfs_parrot_scores <- readr::read_csv("data/dat_scored.csv")
rfs_parrot_scores_nocutoff <- readr::read_csv("data/dat_scored_nocutoffs.csv")


#load("pivot scaling model contains sensitive information")
#load("pivot scaling model (with no cutoff documents) contains sensitive information")

# rfs_parrot and rfs_parrot_nocutoff are pivot scaling models
# omitted from replication materials because they contain sensitive information
g <- get_keywords_custom(rfs_parrot, n_dimensions = 20, 
                         n_words = 10,
                         topic = "Why Run?",
                         capture_output = TRUE)
g2 <- get_keywords_custom(rfs_parrot_nocutoff, n_dimensions = 20, 
                         n_words = 10,
                         topic = "Why Run?",
                         capture_output = TRUE)

# main coefs
pivots_df <- bind_cols(g$D1, g$D2, g$D3)
names(pivots_df) <- paste0("Dimension ", c("1 (-)", "1 (+)",
                                           "2 (-)", "2 (+)",
                                           "3 (-)", "3 (+)"))
write.csv(pivots_df, file = "output/pivots_df.csv")  

# SI coefs
lapply(1:20, function(x){
  df <- g[[x]]
  names(df) <- paste0("Dimension ", x, c(" (-)"," (+)"))
  write.csv(df, file = paste0("output/pivot_dims/main/pivot_words_dim", x, ".csv"))
})

lapply(1:20, function(x){
  df <- g2[[x]]
  names(df) <- paste0("Dimension ", x, c(" (-)"," (+)"))
  write.csv(df, file = paste0("output/pivot_dims/no_cutoffs/pivot_words_dim", x, ".csv"))
})

# Candidate.Intake.Form..why column has been omitted from replication data
 # as it contains sensitive information
rfs_parrot_scores %>%
  arrange(desc(X1)) %>%
  slice(1:10) %>%
  pull(Candidate.Intake.Form..why)

# Candidate.Intake.Form..why column has been omitted from replication data
  # as it contains sensitive information
rfs_parrot_scores %>%
  arrange(X1) %>%
  slice(1:10) %>%
  pull(Candidate.Intake.Form..why)

# the fitted model used to produce keyword plots
# has been omitted from replication data
# as it contains sensitive information
keywords_plot <- plot_keywords_custom(rfs_parrot,
                                      q_cutoff = .95,
                                      subjname = "Why Run?")+
  theme_jg()
ggsave(keywords_plot, file = "figures/topwords_plot_d1d2.png", width = 12, height = 8)

keywords_plot_1_3 <- plot_keywords_custom(rfs_parrot, 
                                      x_dimension = 1, y_dimension = 3,
                                      q_cutoff = .95,
                                      subjname = "Why Run?")+
  theme_jg()
ggsave(keywords_plot_1_3, file = "figures/topwords_plot_d1d3.png", width = 12, height = 8)


p2 <- plot_keywords_custom(rfs_parrot, q_cutoff = .95,
                           x_dimension = 1, y_dimension = 2,
                           subjname = "Why Run?")+
  labs(x = "Dimension 1\n(General Interest - Specific Issues)",
       y = "Dimension 2\n(Core Values - Political Opportunities)")+
  theme_jg()
ggsave(p2, file = paste0("figures/topwords_d1d2_main.png"), width = 12, height = 8)

p3 <- plot_keywords_custom(rfs_parrot, q_cutoff = .95,
                           x_dimension = 1, y_dimension = 3,
                           subjname = "Why Run?")+
  labs(x = "Dimension 1\n(General Interest - Specific Issues)",
       y = "Dimension 3\n(Progressive Populism - Personal Background)")+
  theme_jg()
ggsave(p3, file = paste0("figures/topwords_d1d3_main.png"), width = 12, height = 8)

# all keywords plots
lapply(2:20, function(x){
  p <- plot_keywords_custom(rfs_parrot, q_cutoff = .95,
                                        x_dimension = 1, y_dimension = x,
                                        subjname = "Why Run?")+
    theme_jg()
  ggsave(p, file = paste0("figures/topwords/main/topwords_plot_d1d",x,".png"), width = 12, height = 8)
  
})

lapply(2:20, function(x){
  p <- plot_keywords_custom(rfs_parrot_nocutoff, q_cutoff = .95,
                            x_dimension = 1, y_dimension = x,
                            subjname = "Why Run?")+
    theme_jg()
  ggsave(p, file = paste0("figures/topwords/no_cutoffs/topwords_nocutoffs_plot_d1d",x,".png"), width = 12, height = 8)
  
})


dim1_dist_byrun <- plot_dist(byvar = "running", byname = "Candidate Status")+
  scale_linetype_discrete(name = "",
                          breaks = c(0,1),
                          labels = c("Non-Candidate",
                                     "Candidate"))+
  theme_jg()
ggsave(dim1_dist_byrun, file = "figures/dim1_dist_byrun.png", width = 8, height = 4)


dim2_dist_byrun <- plot_dist(dim = 2,
                             dimname = "Core Values (-) vs. Political Opportunities (+)",
                             byvar = "running", 
                             byname = "Candidate Status")+
  scale_linetype_discrete(name = "",
                          breaks = c(0,1),
                          labels = c("Non-Candidate",
                                     "Candidate"))+
  theme_jg()
ggsave(dim2_dist_byrun, file = "figures/dim2_dist_byrun.png", width = 8, height = 4)

dim3_dist_byrun <- plot_dist(dim = 3,
                             dimname = "Progressive Populism (-) vs. Personal Background (+)",
                             byvar = "running", 
                             byname = "Candidate Status")+
  scale_linetype_discrete(name = "",
                          breaks = c(0,1),
                          labels = c("Non-Candidate",
                                     "Candidate"))+
  theme_jg()
ggsave(dim3_dist_byrun, file = "figures/dim3_dist_byrun.png", width = 8, height = 4)

doc_2d_dist_byrun <- 
  rfs_parrot_scores %>%
  filter(!is.na(X1)) %>%
  ggplot(aes(x = X1, y = X2, lty = factor(running)))+
  geom_density2d(col = "black")+
  scale_linetype_manual(name = "",
                          breaks = c(0,1),
                        values = c("dotted","solid"),
                          labels = c("Non-Candidates",
                                     "Candidates"))+
  labs(x = "Document Score: Dimension 1 (General Interest - Specific Issues)",
       y = "Document Score: Dimension 2 (Core Values - Political Opportunities)",
       title = "Document Score Distributions by Candidate Status",
       subtitle = "Densities in first two dimensions shown")+
  theme_jg()
ggsave(doc_2d_dist_byrun, file = "figures/dist_byrun_d1d2.png", width = 8, height = 6)

doc_densities_123 <- 
  rfs_parrot_scores %>%
  filter(!is.na(X1)) %>%
  dplyr::select(X1, X2, X3, running) %>%
  reshape2::melt(id.vars = c("X1","running")) %>%
  ggplot(aes(x = X1, y = value, lty = factor(running)))+
  facet_wrap(~variable, nrow = 2, ncol = 1,
             labeller = as_labeller(c("X2" = "Dimension 2\n(Core Values - Political Opportunities)",
                                      "X3" = "Dimension 3\n(Progressive Populism - Personal Background)")),
             strip.position = "left")+
  geom_density2d(col = "black")+
  scale_linetype_manual(name = "",
                        breaks = c(0,1),
                        values = c("dotted","solid"),
                        labels = c("Non-Candidates",
                                   "Candidates"))+
  labs(x = "Document Score: Dimension 1 (General Interest - Specific Issues)",
       y = "Document Score: Comparison Dimension",
       title = "Document Score Distributions by Candidate Status",
       subtitle = "Density in first dimension plotted against densities in second and third dimensions",
       caption = "Differences in mean document scores between candidates and non-candidates\nfor Dimensions 1-3 are each significant at p < .001")+
  theme_bw()+
  theme(text = element_text(family = "serif", size = 16),
        strip.text = element_text(face = "bold"),
        plot.title = element_text(face = "bold", size = 20),
        plot.caption = element_text(hjust = 0))
ggsave(doc_densities_123, file = "figures/dist_byrun_d123.png", width = 10, height = 10)

# test if candidates and non-candidates differ along each dimension
t.test(rfs_parrot_scores$X1[rfs_parrot_scores$running == 1],
       rfs_parrot_scores$X1[rfs_parrot_scores$running == 0])
t.test(rfs_parrot_scores$X2[rfs_parrot_scores$running == 1],
       rfs_parrot_scores$X2[rfs_parrot_scores$running == 0])
t.test(rfs_parrot_scores$X3[rfs_parrot_scores$running == 1],
       rfs_parrot_scores$X3[rfs_parrot_scores$running == 0])
